Classification of reports to different level of Suspiciousness, based on underlying relationship of ProductID & SalesmanID to be used for sales and productivity analysis.
Flag transactions as Suspicious with absolute certainty, to take corrective actions.
To develop an error/fraud detection software, you are required to develop a method to determine suspicious transactions which have high propensity to be error/fraud. The goal is to verify the veracity of these reports given past experience of the company that has detected both errors and fraud attempts in these transaction reports.
We are expected to create an analytical and modeling framework to predict the propensity of suspicious transaction of each report
Data consists of the transaction reports submitted by the salesmen which includes the salesman ID, report ID, product ID, the quantity and the total value reported by the salesman.
Multi-Class Classification.
This notebook addresses Task 1: exploratory data analysis using visualizations.
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
import warnings
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as impipe
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import roc_curve, auc, accuracy_score,classification_report, recall_score,precision_score,precision_recall_curve,average_precision_score, silhouette_score,roc_curve, auc,confusion_matrix,mean_absolute_error,mean_squared_error,roc_auc_score,f1_score, precision_recall_fscore_support
from sklearn.model_selection import learning_curve
import graphviz
from sklearn import tree
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot
import math
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import graphviz
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, fbeta_score, recall_score
from sklearn.preprocessing import LabelBinarizer
from statsmodels.graphics.gofplots import qqplot
import statsmodels.api as sm
def set_env_var():
    """Configure notebook-wide environment settings (side effects only).

    Adjusts IPython display behaviour, silences warnings, sets an OpenMP
    workaround env var, widens pandas display limits and enables plotly
    offline rendering.
    """
    # Show the value of every expression in a cell, not just the last one.
    InteractiveShell.ast_node_interactivity = "all"
    # Suppress library warnings for cleaner notebook output.
    warnings.filterwarnings('ignore')
    # Workaround for duplicate OpenMP runtime crashes (common on macOS).
    os.environ['KMP_DUPLICATE_LIB_OK']='True'
    pd.set_option('display.max_rows', 60)
    pd.set_option('display.max_columns', 60)
    # Render plotly figures offline, inline in the notebook.
    init_notebook_mode(connected=True)
%matplotlib inline
set_env_var()
def set_seed(seed=707):
    """Seed the pseudo-random generators for reproducibility.

    Seeds both the stdlib ``random`` module and ``numpy.random``, so
    that numpy/pandas based sampling (e.g. ``DataFrame.sample``) is
    repeatable too — previously only the stdlib RNG was seeded.

    Parameters
    ----------
    seed : int, default 707
        Seed value applied to both generators.
    """
    random.seed(seed)
    np.random.seed(seed)
seed_constant = 122
set_seed(seed_constant)
def data_importer_xlsx(data_path, na_values=None):
    """Load an Excel workbook into a DataFrame and report its shape.

    Parameters
    ----------
    data_path : str or path-like
        Path of the ``.xlsx`` file to read.
    na_values : list of str, optional
        Extra strings to interpret as NaN (forwarded to
        ``pd.read_excel``).  Default changed from a mutable ``[]`` to
        ``None``; an empty list and ``None`` behave identically here
        (pandas defaults only).

    Returns
    -------
    pandas.DataFrame
        The loaded data.
    """
    dataF = pd.read_excel(data_path, na_values=na_values)
    print("The number of Rows in the Data set = "+str(dataF.shape[0]))
    print("The number of Columns in the data set = " +str(dataF.shape[1]))
    return dataF
# Load the raw transaction reports (ReportID, SalesPersonID, ProductID,
# Quantity, TotalSalesValue, Suspicious).  NOTE: machine-specific path.
salesReportData=data_importer_xlsx("/Users/samyam/Documents/Samya/GIT/insofe/Final_PHD/PHD_TrainData_and_ProblemStatement_actual/Train.xlsx")
# Hold out 25% for testing; stratify on the target so all three
# Suspicious levels keep their proportions in both splits.
salesReportDataTrain, salesReportDataTest = train_test_split(salesReportData, test_size=0.25, random_state=seed_constant, stratify=salesReportData['Suspicious'])
print("Train Set shape :",salesReportDataTrain.shape)
print("Test Set shape :",salesReportDataTest.shape)
def data_description(dataF):
    """Print the column names and dtypes of *dataF*."""
    column_names = list(dataF.columns)
    print("The columns in the data set are : \n", column_names)
    print("The data types of the columns are :\n\n", dataF.dtypes)
# Structural overview plus a peek at the first/last rows of the train split.
data_description(salesReportDataTrain)
salesReportDataTrain.head()
salesReportDataTrain.tail()
def get_data_sample(dataF, num_sample=10):
    """Return *num_sample* randomly chosen rows of *dataF* as a DataFrame."""
    sampled_rows = dataF.sample(num_sample)
    return pd.DataFrame(sampled_rows)
get_data_sample(salesReportDataTrain)
def plot_data_dist_across_targ(dataF, target):
    """
    Print and plot the class distribution of the *target* column.

    Prints absolute counts and percentage shares, then draws a
    horizontal seaborn countplot ordered by frequency.
    """
    print(dataF[target].value_counts())
    # Percentage share of each class.
    print((dataF[target].value_counts()/dataF[target].count())*100)
    plt.figure(figsize=(12,4))
    plt.style.use('seaborn-ticks')
    plot_1 = sns.countplot(y=target, data=dataF, order = dataF[target].value_counts().index);
    plot_1.axes.set_title("Target Variable Distribution",fontsize=20);
    plot_1.set_xlabel("Count",fontsize=20);
    plot_1.set_ylabel("Target Variable",fontsize=20);
    plot_1.tick_params(labelsize=15);
    plt.show();
# Class balance of the target on the training split.
plot_data_dist_across_targ(salesReportDataTrain, "Suspicious")
def get_null_count_per_attribute(dataF):
    """Per-column missing-value count and percentage, worst column first."""
    null_counts = dataF.isnull().sum()
    summary = pd.DataFrame({
        'total_missing': null_counts,
        'perc_missing': (null_counts / dataF.shape[0]) * 100,
    })
    return summary.sort_values(by=['perc_missing'], ascending=False)
get_null_count_per_attribute(salesReportDataTrain)
def get_unique_value_count_per_attribute(dataF):
    """Percentage of distinct values per column, least unique first."""
    perc_unique = (dataF.nunique() / dataF.shape[0]) * 100
    result = pd.DataFrame(perc_unique).rename({0: 'perc_unique'}, axis=1)
    return result.sort_values(by=['perc_unique'])
get_unique_value_count_per_attribute(salesReportDataTrain)
# Pivot: total Quantity and TotalSalesValue per ProductID, split across
# Suspicious levels (columns become a (measure, level) MultiIndex).
salesReportDataTrain_var_acr_Sus_lev = pd.pivot_table(
    salesReportDataTrain, values=['Quantity', 'TotalSalesValue'], index=['ProductID'],
    columns=['Suspicious'], aggfunc={'Quantity': np.sum,'TotalSalesValue': np.sum})
salesReportDataTrain_var_acr_Sus_lev.head()
# Average sales value per item = total value / total quantity, per
# ProductID and Suspicious level.
salesReportDataTrain_var_acr_Sus_lev_1 = salesReportDataTrain_var_acr_Sus_lev['TotalSalesValue']/salesReportDataTrain_var_acr_Sus_lev['Quantity']
salesReportDataTrain_var_acr_Sus_lev_1.head()
#Check for Products Suspicious='YES' only
# Keep only products with at least one 'Yes' report, then plot their
# per-level average unit value as grouped bars.
data_sample = salesReportDataTrain_var_acr_Sus_lev_1[salesReportDataTrain_var_acr_Sus_lev_1['Yes'].notnull()]
trace_yes = go.Bar(x=data_sample.index ,
                   y= data_sample.Yes,
                   name = 'Yes')
trace_no = go.Bar(x=data_sample.index ,
                  y= data_sample.No,
                  name = 'No')
trace_indeterminate = go.Bar(x=data_sample.index ,
                             y= data_sample.indeterminate,
                             name = 'indeterminate')
layout = go.Layout(title = 'Distribution of Average TotalSalesValue across Suspicious level',
                   xaxis = dict(title = 'ProductID'),
                   yaxis = dict(title = 'Average TotalSalesValue'))
data = [trace_yes, trace_no, trace_indeterminate]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Observation
# SalesValuePerItem, represents selling price of each ProductID for a given report
salesReportDataTrain_feature_added = salesReportDataTrain.copy()
# Unit price = reported total value / reported quantity.
salesReportDataTrain_feature_added.insert(4, 'SalesValuePerItem',
    salesReportDataTrain_feature_added['TotalSalesValue']/salesReportDataTrain_feature_added['Quantity'])
salesReportDataTrain_feature_added.head()
# Identify Suspicious SalesPersonID, based on frequency of fault
# Per salesman: number of reports flagged Suspicious == 'Yes'.
salesReportDataTrain_feature_added_by_SalesPersonID_1 = salesReportDataTrain_feature_added[
    ['SalesPersonID', 'Suspicious']].groupby(['SalesPersonID']).apply(
    lambda x: (x['Suspicious']== 'Yes').sum()).reset_index(name='countOfSuspiciousEQYesPerGrp')
# Per salesman: total number of reports.
salesReportDataTrain_feature_added_by_SalesPersonID_2 = salesReportDataTrain_feature_added[
    ['SalesPersonID', 'Suspicious']].groupby(['SalesPersonID']).count().reset_index().rename(
    columns={'Suspicious':'countOfSuspiciousPerGrp'})
# Join both aggregates and derive the per-salesman 'Yes' rate (%).
salesReportDataTrain_feature_added_by_SalesPersonID_3 = salesReportDataTrain_feature_added_by_SalesPersonID_1.merge(
    salesReportDataTrain_feature_added_by_SalesPersonID_2, on='SalesPersonID', how='outer')
salesReportDataTrain_feature_added_by_SalesPersonID_3['percOfSuspiciousEQYes'] = (
    salesReportDataTrain_feature_added_by_SalesPersonID_3['countOfSuspiciousEQYesPerGrp']
    /salesReportDataTrain_feature_added_by_SalesPersonID_3['countOfSuspiciousPerGrp'])*100
salesReportDataTrain_feature_added_by_SalesPersonID_3.sort_values(['percOfSuspiciousEQYes'], ascending=[0]).head()
# Identify Suspicious ProductID, based on frequency of fault
# Same three-step aggregation as above, grouped by ProductID instead.
salesReportDataTrain_feature_added_by_ProductID_1 = salesReportDataTrain_feature_added[
    ['ProductID', 'Suspicious']].groupby(
    ['ProductID']).apply(lambda x: (x['Suspicious']== 'Yes').sum()).reset_index(
    name='countOfSuspiciousEQYesPerGrp')
salesReportDataTrain_feature_added_by_ProductID_2 = salesReportDataTrain_feature_added[
    ['ProductID', 'Suspicious']].groupby(
    ['ProductID']).count().reset_index().rename(columns={'Suspicious':'countOfSuspiciousPerGrp'})
salesReportDataTrain_feature_added_by_ProductID_3 = salesReportDataTrain_feature_added_by_ProductID_1.merge(
    salesReportDataTrain_feature_added_by_ProductID_2, on='ProductID', how='outer')
salesReportDataTrain_feature_added_by_ProductID_3['percOfSuspiciousEQYes'] = (
    salesReportDataTrain_feature_added_by_ProductID_3['countOfSuspiciousEQYesPerGrp']/
    salesReportDataTrain_feature_added_by_ProductID_3['countOfSuspiciousPerGrp'])*100
salesReportDataTrain_feature_added_by_ProductID_3.sort_values(['percOfSuspiciousEQYes'], ascending=[0]).head()
Observation
No individual ProductID or SalesPersonID has a direct relation with Suspicious=YES.
# Mean unit price per (SalesPersonID, ProductID, Suspicious) group.
salesReportDataTrain_feature_added_grp_by_SPS = salesReportDataTrain_feature_added[[
    'SalesPersonID', 'ProductID', 'Suspicious', 'SalesValuePerItem']].groupby(
    ['SalesPersonID', 'ProductID', 'Suspicious'], sort=False).mean().reset_index()
# Spread the Suspicious levels into columns ('indeterminate'/'No'/'Yes').
salesReportDataTrain_feature_added_grp_by_SPS_pivot = salesReportDataTrain_feature_added_grp_by_SPS.groupby(
    ['SalesPersonID', 'ProductID', 'Suspicious'])['SalesValuePerItem'].aggregate('first').unstack().rename(
    columns=str).reset_index()
# Drop any row with all the following columns NAN (indeterminate, No, Yes)
salesReportDataTrain_feature_added_grp_by_SPS_pivot_1 = salesReportDataTrain_feature_added_grp_by_SPS_pivot.dropna(
    subset=['indeterminate', 'No', 'Yes'], how='all')
salesReportDataTrain_feature_added_grp_by_SPS_pivot_1.head()
# For each (salesman, product) pair: which Suspicious level carries the
# highest / lowest mean unit price.
salesReportDataTrain_feature_added_grp_by_SPS_pivot_1['max_suspicious_level'] = salesReportDataTrain_feature_added_grp_by_SPS_pivot_1[['indeterminate', 'No', 'Yes']].idxmax(axis=1)
salesReportDataTrain_feature_added_grp_by_SPS_pivot_1['min_suspicious_level'] = salesReportDataTrain_feature_added_grp_by_SPS_pivot_1[['indeterminate', 'No', 'Yes']].idxmin(axis=1)
# Restrict to pairs that have at least one 'Yes' report.
salesReportDataTrain_feature_added_grp_by_SPS_pivot_2 = salesReportDataTrain_feature_added_grp_by_SPS_pivot_1.loc[
    salesReportDataTrain_feature_added_grp_by_SPS_pivot_1['Yes'].notnull()]
salesReportDataTrain_feature_added_grp_by_SPS_pivot_2.head()
# For SalesPersonID and ProductID group, for rows having Suspicious=YES is there a pattern ?
# Count of pairs where 'Yes' carries the extreme (max or min) unit price.
salesReportDataTrain_feature_added_grp_by_SPS_pivot_2[(
    salesReportDataTrain_feature_added_grp_by_SPS_pivot_2['max_suspicious_level']=='Yes') | (
    salesReportDataTrain_feature_added_grp_by_SPS_pivot_2['min_suspicious_level']=='Yes')].shape[0]
# Scenarios when Suspicious=YES, but the SalesValuePerItem is min or max for a specific
# SalesPersonIDand ProductID group
salesReportDataTrain_feature_added_grp_by_SPS_pivot_2[(
    salesReportDataTrain_feature_added_grp_by_SPS_pivot_2['max_suspicious_level']!='Yes') & (
    salesReportDataTrain_feature_added_grp_by_SPS_pivot_2['min_suspicious_level']!='Yes')].shape[0]
salesReportDataTrain_feature_added_grp_by_SPS_pivot_2[(
    salesReportDataTrain_feature_added_grp_by_SPS_pivot_2['max_suspicious_level']!='Yes') & (
    salesReportDataTrain_feature_added_grp_by_SPS_pivot_2['min_suspicious_level']!='Yes')]
Observations
# Composite key for the x-axis of the scatter plot below.
salesReportDataTrain_feature_added_grp_by_SPS_pivot_1['ProductID_SalesPersonID'] = salesReportDataTrain_feature_added_grp_by_SPS_pivot_1[['ProductID', 'SalesPersonID']].apply(lambda x: '_'.join(x), axis=1)
data_sample = salesReportDataTrain_feature_added_grp_by_SPS_pivot_1
# One scatter trace per Suspicious level, sharing the same x-axis.
trace_yes = go.Scatter(x=data_sample.ProductID_SalesPersonID ,
                       y= data_sample.Yes,
                       name = 'Yes',
                       mode= 'markers')
trace_no = go.Scatter(x=data_sample.ProductID_SalesPersonID ,
                      y= data_sample.No,
                      name = 'No',
                      mode= 'markers')
trace_indeterminate = go.Scatter(x=data_sample.ProductID_SalesPersonID ,
                                 y= data_sample.indeterminate,
                                 name = 'indeterminate',
                                 mode= 'markers')
layout = go.Layout(title = 'Distribution of Avg TotalSalesValue across Suspicious level, groupby ProductID and SalesPersonID',
                   xaxis = dict(title = 'ProductID_SalesPersonID',titlefont=dict(size=10), tickfont=dict(size=7)),
                   yaxis = dict(title = 'Average TotalSalesValue for each Suspicious level',titlefont=dict(size=10)))
data = [trace_yes, trace_no, trace_indeterminate]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Rebuild the feature frame.  NOTE: SalesValuePerItem is now inserted at
# position 5 (after the target), unlike position 4 used earlier.
salesReportDataTrain_feature_added = salesReportDataTrain.copy()
salesReportDataTrain_feature_added.insert(5, 'SalesValuePerItem',
    salesReportDataTrain_feature_added['TotalSalesValue']/salesReportDataTrain_feature_added['Quantity'])
salesReportDataTrain_feature_added.head()
# Product-level totals, and the resulting average unit price per product.
salesReportDataTrain_grp_productID = salesReportDataTrain_feature_added.groupby('ProductID').agg(
    {'TotalSalesValue':'sum','Quantity':'sum'}).reset_index().rename(columns={
    'TotalSalesValue':'TotalSalesValueByProduct',
    'Quantity':'QuantityByProduct'
    })
salesReportDataTrain_grp_productID['AverageSalesValueByProduct'] = salesReportDataTrain_grp_productID['TotalSalesValueByProduct']/salesReportDataTrain_grp_productID['QuantityByProduct']
salesReportDataTrain_grp_productID.head()
# Append the AverageSalesValueByProduct to dataset
salesReportDataTrain_add_feature_grp_P = salesReportDataTrain_feature_added.merge(
    salesReportDataTrain_grp_productID, on='ProductID', how='outer')
salesReportDataTrain_add_feature_grp_P.head()
#Check for Products Suspicious='YES' only
# Compare each report's unit price with the product-wide average for a
# random sample of 10 'Yes' reports.
data_sample = salesReportDataTrain_add_feature_grp_P[salesReportDataTrain_add_feature_grp_P['Suspicious']=='Yes'].sample(10)
trace_SalesValuePerItem = go.Bar(x=data_sample.ReportID ,
                                 y= data_sample.SalesValuePerItem,
                                 name = 'SalesValuePerItem')
trace_TotalSalesValueByProduct = go.Bar(x=data_sample.ReportID ,
                                        y= data_sample.AverageSalesValueByProduct,
                                        name = 'AverageSalesValueByProduct')
layout = go.Layout(title = 'Distribution of SalesValuePerItem per transaction vs AverageSalesValueByProduct (For Suspicious=YES)',
                   xaxis = dict(title = 'ReportID', tickangle = -45),
                   yaxis = dict(title = 'Average TotalSalesValue By Product'))
data = [trace_SalesValuePerItem, trace_TotalSalesValueByProduct]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
#Check for Products Suspicious='No' only
# Same comparison for 10 random 'No' reports.
data_sample = salesReportDataTrain_add_feature_grp_P[salesReportDataTrain_add_feature_grp_P['Suspicious']=='No'].sample(10)
trace_SalesValuePerItem = go.Bar(x=data_sample.ReportID ,
                                 y= data_sample.SalesValuePerItem,
                                 name = 'SalesValuePerItem')
trace_TotalSalesValueByProduct = go.Bar(x=data_sample.ReportID ,
                                        y= data_sample.AverageSalesValueByProduct,
                                        name = 'AverageSalesValueByProduct')
layout = go.Layout(title = 'Distribution of SalesValuePerItem per transaction vs AverageSalesValueByProduct (For Suspicious=NO)',
                   xaxis = dict(title = 'ReportID', tickangle = -45),
                   yaxis = dict(title = 'Average TotalSalesValue By Product'))
data = [trace_SalesValuePerItem, trace_TotalSalesValueByProduct]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
#Check for Products Suspicious='indeterminate' only
# Same comparison for 10 random 'indeterminate' reports.
data_sample = salesReportDataTrain_add_feature_grp_P[salesReportDataTrain_add_feature_grp_P['Suspicious']=='indeterminate'].sample(10)
trace_SalesValuePerItem = go.Bar(x=data_sample.ReportID ,
                                 y= data_sample.SalesValuePerItem,
                                 name = 'SalesValuePerItem')
trace_TotalSalesValueByProduct = go.Bar(x=data_sample.ReportID ,
                                        y= data_sample.AverageSalesValueByProduct,
                                        name = 'AverageSalesValueByProduct')
layout = go.Layout(title = 'Distribution of SalesValuePerItem per transaction vs AverageSalesValueByProduct (For Suspicious=indeterminate)',
                   xaxis = dict(title = 'ReportID', tickangle = -45),
                   yaxis = dict(title = 'Average TotalSalesValue By Product'))
data = [trace_SalesValuePerItem, trace_TotalSalesValueByProduct]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
#### Get Deviation of SalesValuePerItem from AverageSalesValueByProduct
# Absolute % deviation of this report's unit price from the product-wide
# average unit price.
salesReportDataTrain_add_feature_grp_P['SalesValuePerItemVariationFromAvgProduct'] = (((salesReportDataTrain_add_feature_grp_P['AverageSalesValueByProduct'] -
    salesReportDataTrain_add_feature_grp_P['SalesValuePerItem']).abs())/salesReportDataTrain_add_feature_grp_P['AverageSalesValueByProduct'])*100
salesReportDataTrain_add_feature_grp_P.head()
# One violin per Suspicious level of the deviation distribution.
data = []
for i in range(0,len(pd.unique(salesReportDataTrain_add_feature_grp_P['Suspicious']))):
    trace = {
        "type": 'violin',
        "x": salesReportDataTrain_add_feature_grp_P['Suspicious'][salesReportDataTrain_add_feature_grp_P['Suspicious'] == pd.unique(salesReportDataTrain_add_feature_grp_P['Suspicious'])[i]],
        "y": salesReportDataTrain_add_feature_grp_P['SalesValuePerItemVariationFromAvgProduct'][salesReportDataTrain_add_feature_grp_P['Suspicious'] == pd.unique(salesReportDataTrain_add_feature_grp_P['Suspicious'])[i]],
        "name": pd.unique(salesReportDataTrain_add_feature_grp_P['Suspicious'])[i],
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        }
    }
    data.append(trace)
fig = {
    "data": data,
    "layout" : {
        "title": "Distribution of SalesValuePerItem per transaction vs AverageSalesValueByProduct",
        "yaxis": {
            "zeroline": False,
        }
    }
}
iplot(fig)
Observation
Is the overall sale quantity of a product related to the quantity of that product sold in the current transaction?
# Share (%) of the product's total quantity contributed by this report.
salesReportDataTrain_add_feature_grp_P['perc_Quantity_by_Product'] = (
    salesReportDataTrain_add_feature_grp_P['Quantity']/salesReportDataTrain_add_feature_grp_P['QuantityByProduct'])*100
salesReportDataTrain_add_feature_grp_P.head()
# One violin per Suspicious level of the quantity-share distribution.
data = []
for i in range(0,len(pd.unique(salesReportDataTrain_add_feature_grp_P['Suspicious']))):
    trace = {
        "type": 'violin',
        "x": salesReportDataTrain_add_feature_grp_P['Suspicious'][salesReportDataTrain_add_feature_grp_P['Suspicious'] == pd.unique(salesReportDataTrain_add_feature_grp_P['Suspicious'])[i]],
        "y": salesReportDataTrain_add_feature_grp_P['perc_Quantity_by_Product'][salesReportDataTrain_add_feature_grp_P['Suspicious'] == pd.unique(salesReportDataTrain_add_feature_grp_P['Suspicious'])[i]],
        "name": pd.unique(salesReportDataTrain_add_feature_grp_P['Suspicious'])[i],
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        }
    }
    data.append(trace)
fig = {
    "data": data,
    "layout" : {
        "title": "Distribution of % variation in Quantity per transaction vs Total Quantity Group By Product",
        "yaxis": {
            "zeroline": False,
        }
    }
}
iplot(fig)
Observation
# Salesman-level totals, and the resulting average unit price per salesman.
salesReportDataTrain_grp_SalesPersonID = salesReportDataTrain_add_feature_grp_P.groupby('SalesPersonID').agg(
    {'TotalSalesValue':'sum','Quantity':'sum'}).reset_index().rename(columns={
    'TotalSalesValue':'TotalSalesValueBySalesMan',
    'Quantity':'QuantityBySalesMan'
    })
salesReportDataTrain_grp_SalesPersonID['AverageSalesValueBySalesMan'] = salesReportDataTrain_grp_SalesPersonID['TotalSalesValueBySalesMan']/salesReportDataTrain_grp_SalesPersonID['QuantityBySalesMan']
salesReportDataTrain_grp_SalesPersonID.head()
# Append the AverageSalesValueByProduct to dataset
salesReportDataTrain_add_feature_grp_P_S = salesReportDataTrain_add_feature_grp_P.merge(
    salesReportDataTrain_grp_SalesPersonID, on='SalesPersonID', how='outer')
salesReportDataTrain_add_feature_grp_P_S.head()
#### Get Deviation of SalesValuePerItem from AverageSalesValueBySalesMan
# Absolute % deviation of the report's unit price from the salesman-wide
# average unit price.
salesReportDataTrain_add_feature_grp_P_S['SalesValuePerItemVariationFromAvgSalesMan'] = (((salesReportDataTrain_add_feature_grp_P_S['AverageSalesValueBySalesMan'] -
    salesReportDataTrain_add_feature_grp_P_S['SalesValuePerItem']).abs())/salesReportDataTrain_add_feature_grp_P_S['AverageSalesValueBySalesMan'])*100
salesReportDataTrain_add_feature_grp_P_S.head()
# One violin per Suspicious level of that deviation.
data = []
for i in range(0,len(pd.unique(salesReportDataTrain_add_feature_grp_P_S['Suspicious']))):
    trace = {
        "type": 'violin',
        "x": salesReportDataTrain_add_feature_grp_P_S['Suspicious'][salesReportDataTrain_add_feature_grp_P_S['Suspicious'] == pd.unique(salesReportDataTrain_add_feature_grp_P_S['Suspicious'])[i]],
        "y": salesReportDataTrain_add_feature_grp_P_S['SalesValuePerItemVariationFromAvgSalesMan'][salesReportDataTrain_add_feature_grp_P_S['Suspicious'] == pd.unique(salesReportDataTrain_add_feature_grp_P_S['Suspicious'])[i]],
        "name": pd.unique(salesReportDataTrain_add_feature_grp_P_S['Suspicious'])[i],
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        }
    }
    data.append(trace)
fig = {
    "data": data,
    "layout" : {
        "title": "Distribution of % variation in SalesValuePerItem per transaction vs AverageSalesValueBySalesMan",
        "yaxis": {
            "zeroline": False,
        }
    }
}
iplot(fig)
Observation
Is the overall sale quantity of a product related to the quantity of that product sold in the current transaction?
# Share (%) of the salesman's total quantity contributed by this report.
salesReportDataTrain_add_feature_grp_P_S['perc_Quantity_by_SalesMan'] = (
    salesReportDataTrain_add_feature_grp_P_S['Quantity']/salesReportDataTrain_add_feature_grp_P_S['QuantityBySalesMan'])*100
salesReportDataTrain_add_feature_grp_P_S.head()
# Violin plots: % of the salesman's total quantity contributed by each
# transaction, split by Suspicious level.
data = []
# Fix: read the Suspicious levels from the frame actually being plotted
# (_grp_P_S); the original iterated uniques of _grp_P here, which only
# worked by coincidence because both frames carry the same level set.
suspicious_levels = pd.unique(salesReportDataTrain_add_feature_grp_P_S['Suspicious'])
for level in suspicious_levels:
    level_mask = salesReportDataTrain_add_feature_grp_P_S['Suspicious'] == level
    trace = {
        "type": 'violin',
        "x": salesReportDataTrain_add_feature_grp_P_S['Suspicious'][level_mask],
        "y": salesReportDataTrain_add_feature_grp_P_S['perc_Quantity_by_SalesMan'][level_mask],
        "name": level,
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        }
    }
    data.append(trace)
fig = {
    "data": data,
    "layout" : {
        "title": "Distribution of % variation in Quantity per transaction vs Total Quantity Group By SalesPersonID",
        "yaxis": {
            "zeroline": False,
        }
    }
}
iplot(fig)
Observation
# Totals and average unit price per (SalesPersonID, ProductID) pair.
salesReportDataTrain_grp_SalesPersonID_ProductID = salesReportDataTrain_add_feature_grp_P.groupby(['SalesPersonID','ProductID']).agg(
    {'TotalSalesValue':'sum','Quantity':'sum'}).reset_index().rename(columns={
    'TotalSalesValue':'TotalSalesValueBySalesManProductID',
    'Quantity':'QuantityBySalesManProductID'
    })
salesReportDataTrain_grp_SalesPersonID_ProductID['AverageSalesValueBySalesManProductID'] = salesReportDataTrain_grp_SalesPersonID_ProductID['TotalSalesValueBySalesManProductID']/salesReportDataTrain_grp_SalesPersonID_ProductID['QuantityBySalesManProductID']
salesReportDataTrain_grp_SalesPersonID_ProductID.head()
# Append the AverageSalesValueByProduct to dataset
salesReportDataTrain_add_feature_grp_P_and_S = salesReportDataTrain_add_feature_grp_P_S.merge(
    salesReportDataTrain_grp_SalesPersonID_ProductID, on=['SalesPersonID','ProductID'], how='outer')
salesReportDataTrain_add_feature_grp_P_and_S.head()
#### Get Deviation of SalesValuePerItem from AverageSalesValueBySalesMan
# Absolute % deviation of the unit price from the pair-level average.
salesReportDataTrain_add_feature_grp_P_and_S['SalesValuePerItemVariationFromAvgSalesManProductID'] = (((salesReportDataTrain_add_feature_grp_P_and_S['AverageSalesValueBySalesManProductID'] -
    salesReportDataTrain_add_feature_grp_P_and_S['SalesValuePerItem']).abs())/salesReportDataTrain_add_feature_grp_P_and_S['AverageSalesValueBySalesManProductID'])*100
salesReportDataTrain_add_feature_grp_P_and_S.head()
# Violin plots: % deviation of the report's unit price from the average
# unit price for the same (salesman, product) pair.
data = []
# Fix: take the Suspicious levels from the frame being plotted
# (_P_and_S); the original indexed uniques of _P_S inside the masks,
# which is only safe while the outer merge preserves the level order.
suspicious_levels = pd.unique(salesReportDataTrain_add_feature_grp_P_and_S['Suspicious'])
for level in suspicious_levels:
    level_mask = salesReportDataTrain_add_feature_grp_P_and_S['Suspicious'] == level
    trace = {
        "type": 'violin',
        "x": salesReportDataTrain_add_feature_grp_P_and_S['Suspicious'][level_mask],
        "y": salesReportDataTrain_add_feature_grp_P_and_S['SalesValuePerItemVariationFromAvgSalesManProductID'][level_mask],
        "name": level,
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        }
    }
    data.append(trace)
fig = {
    "data": data,
    "layout" : {
        "title": "Distribution of % variation in SalesValuePerItem per transaction vs SalesValuePerItem across SalesMan & Product",
        "yaxis": {
            "zeroline": False,
        }
    }
}
iplot(fig)
Observation
Is the overall sale quantity, grouped by SalesPersonID and ProductID, related to the quantity of the product sold in the current transaction?
# Share (%) of the (salesman, product) pair's total quantity contributed
# by this single report.
salesReportDataTrain_add_feature_grp_P_and_S['perc_Quantity_by_SalesMan_ProductID'] = (
    salesReportDataTrain_add_feature_grp_P_and_S['Quantity']/salesReportDataTrain_add_feature_grp_P_and_S['QuantityBySalesManProductID'])*100
salesReportDataTrain_add_feature_grp_P_and_S.head()
# One violin per Suspicious level of that share.
data = []
for i in range(0,len(pd.unique(salesReportDataTrain_add_feature_grp_P_and_S['Suspicious']))):
    trace = {
        "type": 'violin',
        "x": salesReportDataTrain_add_feature_grp_P_and_S['Suspicious'][salesReportDataTrain_add_feature_grp_P_and_S['Suspicious'] == pd.unique(salesReportDataTrain_add_feature_grp_P_and_S['Suspicious'])[i]],
        "y": salesReportDataTrain_add_feature_grp_P_and_S['perc_Quantity_by_SalesMan_ProductID'][salesReportDataTrain_add_feature_grp_P_and_S['Suspicious'] == pd.unique(salesReportDataTrain_add_feature_grp_P_and_S['Suspicious'])[i]],
        "name": pd.unique(salesReportDataTrain_add_feature_grp_P_and_S['Suspicious'])[i],
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        }
    }
    data.append(trace)
fig = {
    "data": data,
    "layout" : {
        "title": "Distribution of % variation in Quantity per transaction vs Total Quantity Group By SalesPersonID and ProductID",
        "yaxis": {
            "zeroline": False,
        }
    }
}
iplot(fig)
Observation
There is clear distinction in mean variation from average quantity sold by a Salesman to the quantity sold in case of each Suspicious level.
# Snapshot of the engineered group-level features (one row per unique
# combination), to be reused when preparing the test split.
salesReportDataTrain_share_feature_to_test = salesReportDataTrain_add_feature_grp_P_and_S[['SalesPersonID', 'ProductID',
    'TotalSalesValueByProduct', 'QuantityByProduct', 'AverageSalesValueByProduct',
    'TotalSalesValueBySalesMan', 'QuantityBySalesMan', 'AverageSalesValueBySalesMan',
    'TotalSalesValueBySalesManProductID', 'QuantityBySalesManProductID',
    'AverageSalesValueBySalesManProductID']].drop_duplicates()
salesReportDataTrain_share_feature_to_test.head()
salesReportDataTrain_share_feature_to_test.shape
Identify and drop rows that are duplicate after dropping ReportID
def isDuplicatesPresent(dataF, dropSegColumn=None):
    """Print the row count before and after de-duplication.

    Parameters
    ----------
    dataF : pandas.DataFrame
        Frame to inspect.
    dropSegColumn : str or list of str, optional
        Column(s) dropped before checking for duplicates, e.g. a
        surrogate key such as ``ReportID``.  Default changed from a
        mutable ``[]`` to ``None`` (same effect: drop nothing).
    """
    if dropSegColumn is None:
        dropSegColumn = []
    print("The number of Rows in the Data set = "+str(dataF.shape[0]))
    print("The number of Rows in the Dedup Data set = "+str(dataF.drop(dropSegColumn, axis=1).drop_duplicates().shape[0]))
isDuplicatesPresent(salesReportDataTrain, dropSegColumn = "ReportID")
# Transaction tuples that appear with more than one Suspicious label,
# i.e. conflicting labels for otherwise identical rows.
salesReportDataTrain_identify_suspicious_redundancy = salesReportDataTrain.drop(
    'ReportID', axis=1).drop_duplicates().groupby(
    ['SalesPersonID', 'ProductID', 'Quantity', 'TotalSalesValue']).count().reset_index().rename(
    columns= {'Suspicious' : 'count_Suspicious'})
salesReportDataTrain_identify_suspicious_redundancy[
    salesReportDataTrain_identify_suspicious_redundancy['count_Suspicious'] > 1].head()
# Row counts feeding the funnel visual below.
totalRows = salesReportDataTrain.shape[0]
rowsWithOutDup = salesReportDataTrain.drop('ReportID', axis=1).drop_duplicates().shape[0]
rowsWithOutDupIncLabel = salesReportDataTrain.drop(['ReportID', 'Suspicious'], axis=1).drop_duplicates().shape[0]
# Funnel-style visual summarising the duplicate analysis: tier labels on
# the left trace, the corresponding row counts on the right trace.
# Fix: corrected the displayed typo "Lable" -> "Label".
trace1 = {
    "x": [-350, -350, -350],
    "y": [-390.0, -500.0, -610.0],
    "mode": "text",
    "text": [" Total Rows", " Rows WithOut Dup",
             " Rows WithOut Dup (with Label included)"],
    "textfont": {
        "color": "rgb(200,200,200)",
        "size": 15
    },
    "type": "scatter"
}
trace2 = {
    "x": [350, 350, 350],
    "y": [-390.0, -500.0, -610.0],
    "mode": "text",
    "text": [totalRows, rowsWithOutDup, rowsWithOutDupIncLabel],
    "textfont": {
        "color": "rgb(200,200,200)",
        "size": 15
    },
    "type": "scatter"
}
data = [trace1, trace2]
layout = {
    "height": 560,
    "paper_bgcolor": "rgba(44,58,71,1)",
    "plot_bgcolor": "rgba(44,58,71,1)",
    # Three trapezoids drawn as SVG paths form the funnel tiers.
    "shapes": [
        {
            "fillcolor": "rgb(32,155,160)",
            "line": {
                "color": "rgb(32,155,160)",
                "width": 1
            },
            "path": "M 200.0 -340 L 90.0 -440 L -90.0 -440 L -200.0 -340 Z",
            "type": "path"
        },
        {
            "fillcolor": "rgb(28,119,139)",
            "line": {
                "color": "rgb(28,119,139)",
                "width": 1
            },
            "path": "M 80.0 -450 L 25 -550 L -25 -550 L -80.0 -450 Z",
            "type": "path"
        },
        {
            "fillcolor": "rgb(182,231,235)",
            "line": {
                "color": "rgb(182,231,235)",
                "width": 1
            },
            "path": "M 20 -560 L 3.5 -660 L -3.5 -660 L -20 -560 Z",
            "type": "path"
        }
    ],
    "showlegend": False,
    "title": "<b>Input data - Duplicate analysis</b>",
    "titlefont": {
        "color": "rgb(203,203,203)",
        "size": 20
    },
    "width": 800,
    "xaxis": {
        "showticklabels": False,
        "zeroline": False
    },
    "yaxis": {
        "showticklabels": False,
        "zeroline": False
    }
}
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Observations
Drop attributes that are monotonically increasing, have many nulls, or carry no significance for the analysis.
Action
# Drop ReportID (a pure surrogate key) and the duplicate rows it masked.
salesReportDataTrain_dropped_attr = salesReportDataTrain.drop(['ReportID'], axis=1)
salesReportDataTrain_dropped_attr.head()
salesReportDataTrain_dropped_attr_no_dup = salesReportDataTrain_dropped_attr.drop_duplicates()
salesReportDataTrain_dropped_attr_no_dup.shape
salesReportDataTrain_dropped_attr_no_dup.head()
# Re-check the class balance after de-duplication.
plot_data_dist_across_targ(salesReportDataTrain_dropped_attr_no_dup, "Suspicious")
Observations
1. Categorical datatype: SalesPersonID, ProductID, Suspicious
2. Numeric datatype: Quantity, TotalSalesValue
salesReportDataTrain_dropped_attr_no_dup.dtypes
# Declare the categorical columns; every remaining column is numeric.
cat_col = ['SalesPersonID', 'ProductID', 'Suspicious']
num_col = np.setdiff1d(salesReportDataTrain_dropped_attr_no_dup.columns, cat_col).tolist()
cat_col
num_col
def covert_var(dataF, conversion_type):
    """Cast *dataF* to a target dtype family.

    (The name "covert" is a historical typo for "convert", kept because
    callers elsewhere in the notebook depend on it.)

    Parameters
    ----------
    dataF : pandas.DataFrame or Series
        Data to cast.
    conversion_type : {'cat', 'num_float', 'num_int'}
        Target dtype family.

    Returns
    -------
    A copy of *dataF* cast to 'category', 'float64' or 'int64'.

    Raises
    ------
    ValueError
        For an unknown *conversion_type* (previously this surfaced as a
        confusing UnboundLocalError).
    """
    if conversion_type == "cat":
        return dataF.astype('category', copy=True)
    if conversion_type == "num_float":
        return dataF.astype('float64', copy=True)
    if conversion_type == "num_int":
        return dataF.astype('int64', copy=True)
    raise ValueError("Unknown conversion_type: %r" % (conversion_type,))
#Do the datatype conversion
# Cast declared categoricals to 'category' and the rest to float64.
salesReportDataTrain_dropped_attr_no_dup[cat_col] = covert_var(salesReportDataTrain_dropped_attr_no_dup[cat_col], "cat")
salesReportDataTrain_dropped_attr_no_dup[num_col] = covert_var(salesReportDataTrain_dropped_attr_no_dup[num_col], "num_float")
salesReportDataTrain_dropped_attr_no_dup.head()
salesReportDataTrain_dropped_attr_no_dup.dtypes
def get_correlation_plot(dataF):
    """Draw an annotated heatmap of the pairwise correlations in *dataF*."""
    correlations = dataF.corr()
    fig, axis = plt.subplots(figsize=(6, 4))
    sns.heatmap(correlations, ax=axis, annot=True, cmap="YlGnBu");
get_correlation_plot(salesReportDataTrain_dropped_attr_no_dup)
def logistic_regg_Corr(dataF, target, solver='liblinear'):
    """
    This is to understand correlation between continuous IV & Categorical DV.

    Fits a one-feature logistic regression per column and prints its
    training accuracy as a rough strength-of-association measure.
    """
    complete_rows = dataF.dropna()
    y = complete_rows[target]
    features = complete_rows.drop(target, axis=1)
    for col_cor in features.columns:
        single_feature = features[col_cor].values.reshape(-1, 1)
        model = LogisticRegression(random_state=0, solver=solver).fit(single_feature, y)
        print("Mean accuracy with column", col_cor, "is :", model.score(single_feature, y))
# Numeric columns + target only; indexes reset so the concat aligns row-wise.
salesReportDataTrain_dropped_attr_no_dup_target = pd.concat([salesReportDataTrain_dropped_attr_no_dup[num_col].reset_index(
    drop=True), salesReportDataTrain_dropped_attr_no_dup['Suspicious'].reset_index(drop=True)], axis=1)
logistic_regg_Corr(salesReportDataTrain_dropped_attr_no_dup_target, "Suspicious", solver='lbfgs')
def get_chi_square_corr_score(dataF, target, prob=0.95):
    """
    Test each categorical column for independence from *target* using
    Pearson's chi-squared test of independence.

    Fixes over the original version:
    - operates on an internal copy, so the caller's DataFrame is no
      longer mutated (it label-encoded the target and filled NAs in
      place);
    - iterates every column except the *target* parameter instead of the
      hard-coded 'Suspicious' column name.

    Parameters
    ----------
    dataF : pandas.DataFrame
        Categorical columns plus the target column.
    target : str
        Name of the dependent variable column.
    prob : float, default 0.95
        Confidence level; the significance level is ``1 - prob``.

    Prints one dependent/independent verdict per column.
    """
    dataF = dataF.copy()
    # Integer-encode the target (equivalent to LabelEncoder here; the
    # chi-squared statistic is invariant to the particular label values).
    dataF[target] = pd.factorize(dataF[target])[0]
    alpha = 1.0 - prob
    print('Confidence level=%.3f and significance level=%.3f \n' % (prob, alpha))
    for column in dataF.columns.drop(target):
        # Impute missing values with the column mode before tabulating.
        dataF[column] = dataF[column].fillna(dataF[column].mode()[0])
        table = pd.crosstab(dataF[column], dataF[target])
        stat, p, dof, expected = chi2_contingency(table)
        # (Equivalent critical-value check: reject H0 when
        # abs(stat) >= chi2.ppf(prob, dof).)
        if p <= alpha:
            # Reject H0 -> evidence of dependence between IV and DV.
            print('IV : %s and DV: %s are dependent' % (column, target))
        else:
            print('IV : %s and DV: %s are independent' % (column, target))
# Chi-squared screen of the categorical IVs against Suspicious at 99% confidence.
get_chi_square_corr_score(salesReportDataTrain_dropped_attr_no_dup[cat_col], 'Suspicious', prob=0.99)
Observation

Can the less frequently occurring ProductID values be binned into an "Others" category to reduce the number of product levels?
def data_dist_across_val(dataF, target):
    """
    Cumulative coverage of *target*'s value frequencies.

    Returns a frame indexed by the distinct values of *target* with columns
    per_prod_count, percent_prod_count and cum_percent_prod_count.
    """
    counts = dataF[target].value_counts()
    # Name the count column directly via to_frame(name=...). The original
    # rename(columns={'ProductID': ...}) is a silent no-op on pandas >= 2.0,
    # where value_counts().to_frame() names its column 'count'.
    dataF_1 = counts.to_frame(name='per_prod_count')
    dataF_1['percent_prod_count'] = (counts / dataF[target].count()) * 100
    dataF_1['cum_percent_prod_count'] = dataF_1['percent_prod_count'].cumsum()
    return dataF_1
# Cumulative ProductID coverage: how few distinct products account for the
# bulk of transactions (informs binning rare products into an 'Others' level).
data_dist_across_val_df = data_dist_across_val(salesReportDataTrain_dropped_attr_no_dup, "ProductID").reset_index()
# go.Scatter replaces the deprecated go.Line trace constructor.
trace = go.Scatter(x=data_dist_across_val_df.index,
                   y=data_dist_across_val_df.cum_percent_prod_count,
                   name='Yes',
                   mode='markers')
layout = go.Layout(title='Cumulative Distribution of ProductID coverage',
                   xaxis=dict(title='Index', titlefont=dict(size=10), tickfont=dict(size=7)),
                   yaxis=dict(title='Cumulative Distribution', titlefont=dict(size=10)))
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Can the less frequently occurring SalesPersonID values be binned into an "Others" category?
def data_dist_across_val(dataF, target):
    """
    Cumulative coverage of *target*'s value frequencies (salesperson view).

    NOTE: this redefines the helper above with salesperson-named output
    columns: per_sales_p_count, percent_sales_p_count, cum_percent_sales_p_count.
    """
    counts = dataF[target].value_counts()
    # to_frame(name=...) names the column directly; the original
    # rename(columns={'SalesPersonID': ...}) is a no-op on pandas >= 2.0.
    dataF_1 = counts.to_frame(name='per_sales_p_count')
    dataF_1['percent_sales_p_count'] = (counts / dataF[target].count()) * 100
    dataF_1['cum_percent_sales_p_count'] = dataF_1['percent_sales_p_count'].cumsum()
    return dataF_1
data_dist_across_val_df = data_dist_across_val(salesReportDataTrain_dropped_attr_no_dup, "SalesPersonID").reset_index()
trace = go.Line(x=data_dist_across_val_df.index ,
y= data_dist_across_val_df.cum_percent_sales_p_count,
name = 'Yes',
mode= 'markers')
layout = go.Layout(title = 'Cumulative Distribution of SalesPersonID coverage',
xaxis = dict(title = 'Index',titlefont=dict(size=10), tickfont=dict(size=7)),
yaxis = dict(title = 'Cumulative Distribution',titlefont=dict(size=10)))
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Observation
Note: since this is a baseline model, we one-hot encode the categorical columns directly, even though this is expected to inflate the number of columns considerably.
salesReportDataTrain_dropped_attr_no_dup.dtypes
# Split features from the target for the baseline pipeline.
X_train = salesReportDataTrain_dropped_attr_no_dup.drop('Suspicious', axis=1)
y_train = salesReportDataTrain_dropped_attr_no_dup['Suspicious']
# Per-dtype column lists drive the ColumnTransformer routing below.
num_col_split = list(X_train.select_dtypes(include='float').columns)
cat_col_split = list(X_train.select_dtypes(include='category').columns)
num_col_split
cat_col_split
def define_num_transformer(**transformers):
    """
    Build a sklearn Pipeline of numeric preprocessing steps.

    Supported keyword steps:
      imputer=<strategy> -> SimpleImputer(strategy=<strategy>)
      scaler=<anything>  -> StandardScaler()

    Raises
    ------
    ValueError on an unknown step name. The original returned the string
    "Not a valid transformation" mid-loop, discarding steps already
    collected and crashing only later inside sklearn.
    """
    steps = []
    for key, value in transformers.items():
        if key == "imputer":
            steps.append((key, SimpleImputer(strategy=value)))
        elif key == "scaler":
            steps.append((key, StandardScaler()))
        else:
            raise ValueError("Not a valid transformation: %s" % key)
    # memory='./' caches fitted transformers in the working directory.
    return Pipeline(memory='./', steps=steps)
def define_cat_transformer(**transformers):
    """
    Build a sklearn Pipeline of categorical preprocessing steps.

    Supported keyword steps:
      imputer=...              -> SimpleImputer(strategy='most_frequent')
      dummyfication_onehot=... -> OneHotEncoder(handle_unknown='ignore')

    Raises
    ------
    ValueError on an unknown step name instead of returning an error
    string mid-loop.
    """
    steps = []
    for key, value in transformers.items():
        if key == "imputer":
            # fill_value was dropped: SimpleImputer ignores it for
            # strategy='most_frequent', so it had no effect in the original.
            steps.append(('imputer', SimpleImputer(strategy='most_frequent')))
        elif key == "dummyfication_onehot":
            steps.append(('onehot', OneHotEncoder(handle_unknown='ignore')))
        else:
            raise ValueError("Not a valid transformation: %s" % key)
    return Pipeline(memory='./', steps=steps)
def get_column_transformer(num_transformer=None, num_col_split=None, cat_transformer=None, cat_col_split=None):
    """
    Assemble a ColumnTransformer that routes numeric and categorical
    columns to their respective pipelines; either branch may be omitted.
    """
    transformer_step = []
    # 'is not None' rather than '!= None': identity comparison is the
    # idiomatic (and estimator-safe) sentinel test.
    if num_transformer is not None:
        transformer_step.append(('num', num_transformer, num_col_split))
    if cat_transformer is not None:
        transformer_step.append(('cat', cat_transformer, cat_col_split))
    return ColumnTransformer(transformers=transformer_step)
# Numeric branch: standard scaling only.
num_transformer_step = {"scaler":"StandardScaler"}
num_transformer = define_num_transformer(**num_transformer_step)
# Categorical branch: one-hot encoding, ignoring levels unseen at fit time.
cat_transformer_step = {"dummyfication_onehot":"OneHotEncoder"}
cat_transformer = define_cat_transformer(**cat_transformer_step)
preprocessor = get_column_transformer(num_transformer, num_col_split, cat_transformer, cat_col_split)
preprocessor
def add_to_pipeline(**final_pipeline_steps):
    """
    Assemble an imblearn Pipeline from named steps.

    Supported keyword steps:
      preprocessor=<transformer> -> ('preprocessor', <transformer>)
      baseModel=<estimator>      -> ('classifier', <estimator>)

    Raises
    ------
    ValueError on an unknown step name; the original returned an error
    string, which callers would then have tried to fit().
    """
    steps = []
    for key, value in final_pipeline_steps.items():
        if key == "preprocessor":
            steps.append(('preprocessor', value))
        elif key == "baseModel":
            steps.append(('classifier', value))
        else:
            raise ValueError("Not a valid step: %s" % key)
    return impipe(steps=steps)
#Pipeline steps to be added to imBalance pipeline
# The classifier here is a placeholder: GridSearchCV swaps it via the
# 'classifier' key of the search space below.
baseModel = LogisticRegression()
steps = {"preprocessor" : preprocessor, "baseModel": baseModel}
im_pipe_clf = add_to_pipeline(**steps)
def simple_calssification_grid_space():
    """
    Hyper-parameter search space for the baseline classifiers
    (logistic regression and a decision tree).

    solver='liblinear' is pinned explicitly: LogisticRegression's default
    lbfgs solver does not support penalty='l1', so without it half of the
    logistic-regression grid fails during GridSearchCV.
    """
    return [{'classifier': [LogisticRegression(solver='liblinear')],
             'classifier__C': [3, 7],
             'classifier__penalty': ["l1", "l2"],
             'classifier__class_weight': [None, "balanced"]},
            {'classifier': [DecisionTreeClassifier(random_state=0)],
             'classifier__max_depth': [2, 3, 5, 9],
             'classifier__class_weight': [None, "balanced"]}]
def custom_recall_score(ground_truth, predictions):
    """Recall restricted to the positive ('Yes') class, as a length-1 array."""
    positive_label = ['Yes']
    return recall_score(ground_truth, predictions, average=None, labels=positive_label)
# NOTE(review): make_scorer is presumably imported in an earlier cell — it is
# not among the imports visible in this chunk; verify.
cust_recall_score = make_scorer(custom_recall_score, greater_is_better=True)
#Search Space for base classifier
search_space = simple_calssification_grid_space()
#Define repeated KFold
rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=0)
# Exhaustive search over the space, refitting the best pipeline on all data.
CV_grid_model = GridSearchCV(
estimator=im_pipe_clf, param_grid=search_space, cv= rkf, scoring=cust_recall_score, n_jobs=-1, verbose=3, refit=True)
CV_grid_model.fit(X=X_train, y=y_train)
def get_grid_summary(gs_model):
    """
    Print the headline results of a fitted grid search (best score,
    parameters and classifier) and return the full cv_results_ table
    as a pandas DataFrame.
    """
    sections = [
        ('Best parameter : ', gs_model.best_params_),
        ('Best Model : ', gs_model.best_estimator_.get_params()['classifier']),
    ]
    print('Best score = ', gs_model.best_score_)
    print('\n')
    for heading, payload in sections:
        print(heading)
        print(payload)
        print('\n')
    print('Grid search execution stat :')
    return pd.DataFrame(gs_model.cv_results_)
# Inspect the winning configuration and the full CV results table.
get_grid_summary(CV_grid_model)
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and cross-validation score curves (scored with the
    module-level cust_recall_score) against the number of training
    examples, with +/- one-standard-deviation bands per curve.
    """
    plt.figure(figsize=(8, 8))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=cust_recall_score)
    # Mean and spread across CV folds at each training-set size.
    mean_train = np.mean(train_scores, axis=1)
    std_train = np.std(train_scores, axis=1)
    mean_test = np.mean(test_scores, axis=1)
    std_test = np.std(test_scores, axis=1)
    plt.grid()
    # Shaded +/- one-std bands around each curve.
    plt.fill_between(train_sizes, mean_train - std_train,
                     mean_train + std_train, alpha=0.1, color="r")
    plt.fill_between(train_sizes, mean_test - std_test,
                     mean_test + std_test, alpha=0.1, color="g")
    plt.plot(train_sizes, mean_train, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, mean_test, 'o-', color="b",
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
# Cross validation with 100 shuffle-split iterations to get smoother mean
# test and train score curves, each time with 20% of the data randomly
# selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
# Preprocessing-only pipeline: the bare classifier in the learning curve
# needs already-transformed features.
steps = {"preprocessor" : preprocessor}
im_pipe_pre_process = add_to_pipeline(**steps)
estimator = CV_grid_model.best_estimator_.get_params()['classifier']
title = r"Learning Curves"
plot_learning_curve(estimator, title, im_pipe_pre_process.fit_transform(X_train), y_train,
(0.1, 1.01), cv=cv, n_jobs=-1)
def evaluate_model(model, X, y):
    """
    Print overall accuracy and a per-class classification report for
    *model* on (X, y), then return the predictions.
    """
    predictions = model.predict(X)
    print("Accuracy score :", accuracy_score(y, predictions))
    print("classification_report :\n", classification_report(y, predictions, digits=4))
    return predictions
# Training-set performance of the best model (an optimistic estimate).
y_pred = evaluate_model(CV_grid_model, X_train, y_train)
y_pred
Observations
#Drop ReportID column, as in case of train
salesReportDataTest_dropped_attr_no_dup = salesReportDataTest.drop(['ReportID'], axis=1)
# Datatype conversions
salesReportDataTest_dropped_attr_no_dup[cat_col] = covert_var(salesReportDataTest_dropped_attr_no_dup[cat_col], "cat")
salesReportDataTest_dropped_attr_no_dup[num_col] = covert_var(salesReportDataTest_dropped_attr_no_dup[num_col], "num_float")
# Hold-out evaluation of the grid-search winner.
X_test = salesReportDataTest_dropped_attr_no_dup.drop('Suspicious', axis=1)
y_test = salesReportDataTest_dropped_attr_no_dup['Suspicious']
y_test_pred = evaluate_model(CV_grid_model, X_test, y_test)
y_test_pred
def plot_confusion_matrix(y, y_pred):
    """
    Print and heatmap the confusion matrix for (y, y_pred), with tick
    labels in a deterministic order.
    """
    # sorted() fixes the label order; the original list(set(y)) order is
    # arbitrary, so repeated runs could permute the matrix layout.
    labels = sorted(set(y))
    ax = plt.subplot()
    # labels is keyword-only in scikit-learn >= 1.0; passing it
    # positionally (as the original did) raises a TypeError there.
    cm = confusion_matrix(y, y_pred, labels=labels)
    sns.heatmap(cm, annot=True, ax=ax, cmap="YlGnBu");
    print(cm)
    ax.set_xlabel('Predicted labels');
    ax.set_ylabel('True labels');
    ax.set_title('Confusion Matrix');
    ax.xaxis.set_ticklabels(labels);
    ax.yaxis.set_ticklabels(labels);
# Confusion matrix of the baseline model on the hold-out set.
plot_confusion_matrix(y_test,y_test_pred)
Observation
Drop rows whose independent variables are identical but which carry more than one value of the dependent variable (conflicting labels for the same transaction attributes).
# De-duplicate exact repeats after dropping the per-row ReportID key.
salesReportDataTrain_identify_suspicious_redundancy_1 = salesReportDataTrain.drop(
'ReportID', axis=1).drop_duplicates(keep='first')
print(salesReportDataTrain.shape)
print(salesReportDataTrain_identify_suspicious_redundancy_1.shape)
# Where identical independent variables were assigned more than one target
# label, keep only the last occurrence (resolves conflicting labels).
salesReportDataTrain_identify_suspicious_redundancy = salesReportDataTrain_identify_suspicious_redundancy_1.drop_duplicates(
subset=['SalesPersonID', 'ProductID','Quantity', 'TotalSalesValue'], keep="last")
salesReportDataTrain_identify_suspicious_redundancy.shape
# Recompute the per-dtype column lists and convert dtypes as before.
cat_col = ['SalesPersonID', 'ProductID', 'Suspicious']
num_col = np.setdiff1d(salesReportDataTrain_identify_suspicious_redundancy.columns, cat_col).tolist()
cat_col
num_col
salesReportDataTrain_identify_suspicious_redundancy[cat_col] = covert_var(salesReportDataTrain_identify_suspicious_redundancy[cat_col], "cat")
salesReportDataTrain_identify_suspicious_redundancy[num_col] = covert_var(salesReportDataTrain_identify_suspicious_redundancy[num_col], "num_float")
salesReportDataTrain_identify_suspicious_redundancy.head()
salesReportDataTrain_identify_suspicious_redundancy.dtypes
X_train = salesReportDataTrain_identify_suspicious_redundancy.drop('Suspicious', axis=1)
y_train = salesReportDataTrain_identify_suspicious_redundancy['Suspicious']
#Search Space for base classifier
search_space = simple_calssification_grid_space()
#Define repeated KFold
rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=0)
# Re-run the same grid search on the de-conflicted training data.
CV_grid_model = GridSearchCV(
estimator=im_pipe_clf, param_grid=search_space, cv= rkf, scoring=cust_recall_score, n_jobs=-1, verbose=3, refit=True)
CV_grid_model.fit(X=X_train, y=y_train)
get_grid_summary(CV_grid_model)
# Cross validation with 100 shuffle-split iterations to get smoother mean
# test and train score curves, each using 20% of the data as validation.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
steps = {"preprocessor" : preprocessor}
im_pipe_pre_process = add_to_pipeline(**steps)
estimator = CV_grid_model.best_estimator_.get_params()['classifier']
title = r"Learning Curves"
plot_learning_curve(estimator, title, im_pipe_pre_process.fit_transform(X_train), y_train,
(0.1, 1.01), cv=cv, n_jobs=-1)
# Training-set performance for this second model.
y_pred = evaluate_model(CV_grid_model, X_train, y_train)
y_pred
#Drop ReportID column, as in case of train
salesReportDataTest_dropped_attr_no_dup = salesReportDataTest.drop(['ReportID'], axis=1)
# Datatype conversions
salesReportDataTest_dropped_attr_no_dup[cat_col] = covert_var(salesReportDataTest_dropped_attr_no_dup[cat_col], "cat")
salesReportDataTest_dropped_attr_no_dup[num_col] = covert_var(salesReportDataTest_dropped_attr_no_dup[num_col], "num_float")
# Hold-out evaluation of the model trained on de-conflicted data.
X_test = salesReportDataTest_dropped_attr_no_dup.drop('Suspicious', axis=1)
y_test = salesReportDataTest_dropped_attr_no_dup['Suspicious']
y_test_pred = evaluate_model(CV_grid_model, X_test, y_test)
y_test_pred
plot_confusion_matrix(y_test,y_test_pred)
Observation
Using the derived columns to model
# Drop duplicate rows
salesReportDataTrain_add_feature_grp_P_and_S_no_dup = salesReportDataTrain_add_feature_grp_P_and_S.drop('ReportID', axis=1).drop_duplicates()
salesReportDataTrain_add_feature_grp_P_and_S_no_dup.shape
salesReportDataTrain_add_feature_grp_P_and_S_no_dup.head()
salesReportDataTrain_add_feature_grp_P_and_S_no_dup.dtypes
# Drop the raw IDs and intermediate aggregates, keeping only the derived
# variation/percentage features for modeling.
col_to_drop = ['SalesPersonID', 'ProductID',
'TotalSalesValueByProduct', 'QuantityByProduct', 'AverageSalesValueByProduct',
'TotalSalesValueBySalesMan', 'QuantityBySalesMan', 'AverageSalesValueBySalesMan',
'TotalSalesValueBySalesManProductID', 'QuantityBySalesManProductID', 'AverageSalesValueBySalesManProductID',
'SalesValuePerItemVariationFromAvgSalesManProductID', 'perc_Quantity_by_SalesMan_ProductID']
salesReportDataTrain_no_dup_col_fillter = salesReportDataTrain_add_feature_grp_P_and_S_no_dup.drop(
col_to_drop, axis=1)
salesReportDataTrain_no_dup_col_fillter.dtypes
salesReportDataTrain_no_dup_col_fillter.head()
# Only the target remains categorical; everything else is numeric.
cat_col = ['Suspicious']
num_col = np.setdiff1d(salesReportDataTrain_no_dup_col_fillter.columns, cat_col).tolist()
cat_col
num_col
salesReportDataTrain_no_dup_col_fillter[cat_col] = covert_var(salesReportDataTrain_no_dup_col_fillter[cat_col], "cat")
salesReportDataTrain_no_dup_col_fillter[num_col] = covert_var(salesReportDataTrain_no_dup_col_fillter[num_col], "num_float")
salesReportDataTrain_no_dup_col_fillter.head()
salesReportDataTrain_no_dup_col_fillter.dtypes
# Split features from target for the derived-feature model.
X_train = salesReportDataTrain_no_dup_col_fillter.drop('Suspicious', axis=1)
y_train = salesReportDataTrain_no_dup_col_fillter['Suspicious']
num_col_split = list(X_train.select_dtypes(include='float').columns)
cat_col_split = list(X_train.select_dtypes(include='category').columns)
num_col_split
cat_col_split
# Rebuild the preprocessor for the new column split.
num_transformer_step = {"scaler":"StandardScaler"}
num_transformer = define_num_transformer(**num_transformer_step)
cat_transformer_step = {"dummyfication_onehot":"OneHotEncoder"}
cat_transformer = define_cat_transformer(**cat_transformer_step)
preprocessor = get_column_transformer(num_transformer, num_col_split, cat_transformer, cat_col_split)
preprocessor
#Pipeline steps to be added to imBalance pipeline
baseModel = LogisticRegression()
steps = {"preprocessor" : preprocessor, "baseModel": baseModel}
im_pipe_clf = add_to_pipeline(**steps)
#Search Space for base classifier
search_space = simple_calssification_grid_space()
#Define repeated KFold
rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=0)
# Grid search over the derived-feature representation.
CV_grid_model = GridSearchCV(
estimator=im_pipe_clf, param_grid=search_space, cv= rkf, scoring=cust_recall_score, n_jobs=-1, verbose=3, refit=True)
CV_grid_model.fit(X=X_train, y=y_train)
get_grid_summary(CV_grid_model)
# Cross validation with 100 shuffle-split iterations to get smoother mean
# test and train score curves, each using 20% of the data as validation.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
steps = {"preprocessor" : preprocessor}
im_pipe_pre_process = add_to_pipeline(**steps)
estimator = CV_grid_model.best_estimator_.get_params()['classifier']
title = r"Learning Curves"
plot_learning_curve(estimator, title, im_pipe_pre_process.fit_transform(X_train), y_train,
(0.1, 1.01), cv=cv, n_jobs=-1)
y_pred = evaluate_model(CV_grid_model, X_train, y_train)
y_pred
salesReportDataTrain_share_feature_to_test.head()
salesReportDataTrain_share_feature_to_test.dtypes
# Get Average Product selling price per SalesPersonID. This is to fill missing values for ProductID
# NOTE(review): groupby(...).mean() on a frame with non-numeric columns
# raises on pandas >= 2.0 without numeric_only=True — verify pandas version.
salesReportDataTrain_share_feature_to_test_grp_by_SalesPersonID_avg = salesReportDataTrain_share_feature_to_test.groupby(
'SalesPersonID').mean()
salesReportDataTrain_share_feature_to_test_grp_by_SalesPersonID_avg.head()
salesReportDataTrain_share_feature_to_test_grp_by_SalesPersonID_avg.shape
# Get Average Product selling price per ProductID, across SalesPersonID. This is to fill missing values for SalesPersonID
salesReportDataTrain_share_feature_to_test_grp_by_ProductID_avg = salesReportDataTrain_share_feature_to_test.groupby(
'ProductID').mean()
salesReportDataTrain_share_feature_to_test_grp_by_ProductID_avg.head()
salesReportDataTrain_share_feature_to_test_grp_by_ProductID_avg.shape
#Get the required datset from train data to be mapped with test data for derived columns
# Per-product aggregates to join onto the test set.
salesReportDataTrain_share_feature_to_test_by_product = salesReportDataTrain_share_feature_to_test[
['ProductID', 'TotalSalesValueByProduct','QuantityByProduct',
'AverageSalesValueByProduct']].drop_duplicates()
salesReportDataTrain_share_feature_to_test_by_product.shape
salesReportDataTrain_share_feature_to_test_by_product.head()
# Per-salesman aggregates.
salesReportDataTrain_share_feature_to_test_by_salesMan = salesReportDataTrain_share_feature_to_test[
['SalesPersonID', 'TotalSalesValueBySalesMan','QuantityBySalesMan',
'AverageSalesValueBySalesMan']].drop_duplicates()
salesReportDataTrain_share_feature_to_test_by_salesMan.shape
salesReportDataTrain_share_feature_to_test_by_salesMan.head()
# Per (salesman, product) pair aggregates.
salesReportDataTrain_share_feature_to_test_by_product_salesMan = salesReportDataTrain_share_feature_to_test[
['SalesPersonID','ProductID', 'TotalSalesValueBySalesManProductID','QuantityBySalesManProductID',
'AverageSalesValueBySalesManProductID']].drop_duplicates()
salesReportDataTrain_share_feature_to_test_by_product_salesMan.shape
salesReportDataTrain_share_feature_to_test_by_product_salesMan.head()
salesReportDataTest.head()
salesReportDataTest.shape
#Drop ReportID column, as in case of train
salesReportDataTest_dropped_attr_no_dup = salesReportDataTest.drop(['ReportID'], axis=1)
#Add the SalesValuePerItem column
salesReportDataTest_dropped_attr_no_dup.insert(5, 'SalesValuePerItem',
salesReportDataTest_dropped_attr_no_dup['TotalSalesValue']/salesReportDataTest_dropped_attr_no_dup['Quantity'])
#Left join here, as the distinct ProductID in train may not be same as in test
salesReportDataTest_dropped_attr_no_dup_P = salesReportDataTest_dropped_attr_no_dup.merge(
salesReportDataTrain_share_feature_to_test_by_product, on='ProductID', how='left')
salesReportDataTest_dropped_attr_no_dup_P_S = salesReportDataTest_dropped_attr_no_dup_P.merge(
salesReportDataTrain_share_feature_to_test_by_salesMan, on='SalesPersonID', how='left')
salesReportDataTest_dropped_attr_no_dup_P_and_S = salesReportDataTest_dropped_attr_no_dup_P_S.merge(
salesReportDataTrain_share_feature_to_test_by_product_salesMan, on=['SalesPersonID', 'ProductID'], how='left')
salesReportDataTest_dropped_attr_no_dup_P_and_S.head()
salesReportDataTest_dropped_attr_no_dup_P_and_S.shape
salesReportDataTest_dropped_attr_no_dup_P_and_S.dtypes
# Inspect rows whose product/salesman aggregates were not matched in train
# (ignoring the pair-level columns, which are handled separately below).
x=salesReportDataTest_dropped_attr_no_dup_P_and_S.drop([
'TotalSalesValueBySalesManProductID', 'QuantityBySalesManProductID', 'AverageSalesValueBySalesManProductID'], axis=1)
x[x.isnull().any(axis=1)].head()
'''
It means these product are not present in train data
Replace the Missing value with avg of other product details for the same salePerson.
This way it fills up value with the trend of sales by the salesman.
'''
# Product-level aggregates missing -> fall back to the salesman's averages.
salesReportDataTest_dropped_attr_no_dup_P_and_S.loc[
salesReportDataTest_dropped_attr_no_dup_P_and_S.TotalSalesValueByProduct.isnull(),
'TotalSalesValueByProduct'] = salesReportDataTest_dropped_attr_no_dup_P_and_S[
'SalesPersonID'].map(salesReportDataTrain_share_feature_to_test_grp_by_SalesPersonID_avg.TotalSalesValueBySalesMan)
salesReportDataTest_dropped_attr_no_dup_P_and_S.loc[
salesReportDataTest_dropped_attr_no_dup_P_and_S.QuantityByProduct.isnull(),
'QuantityByProduct'] = salesReportDataTest_dropped_attr_no_dup_P_and_S[
'SalesPersonID'].map(salesReportDataTrain_share_feature_to_test_grp_by_SalesPersonID_avg.QuantityBySalesMan)
salesReportDataTest_dropped_attr_no_dup_P_and_S.loc[
salesReportDataTest_dropped_attr_no_dup_P_and_S.AverageSalesValueByProduct.isnull(),
'AverageSalesValueByProduct'] = salesReportDataTest_dropped_attr_no_dup_P_and_S[
'SalesPersonID'].map(salesReportDataTrain_share_feature_to_test_grp_by_SalesPersonID_avg.AverageSalesValueBySalesMan)
'''
It means these salesman are not present in train data
Replace the Missing value with avg of other salesman details for the diff salePerson, selling the same product
This way it fills up value with the trend of sales for the same product.
'''
# Salesman-level aggregates missing -> fall back to the product's averages.
salesReportDataTest_dropped_attr_no_dup_P_and_S.loc[
salesReportDataTest_dropped_attr_no_dup_P_and_S.TotalSalesValueBySalesMan.isnull(),
'TotalSalesValueBySalesMan'] = salesReportDataTest_dropped_attr_no_dup_P_and_S[
'ProductID'].map(salesReportDataTrain_share_feature_to_test_grp_by_ProductID_avg.TotalSalesValueByProduct)
salesReportDataTest_dropped_attr_no_dup_P_and_S.loc[
salesReportDataTest_dropped_attr_no_dup_P_and_S.QuantityBySalesMan.isnull(),
'QuantityBySalesMan'] = salesReportDataTest_dropped_attr_no_dup_P_and_S[
'ProductID'].map(salesReportDataTrain_share_feature_to_test_grp_by_ProductID_avg.QuantityByProduct)
salesReportDataTest_dropped_attr_no_dup_P_and_S.loc[
salesReportDataTest_dropped_attr_no_dup_P_and_S.AverageSalesValueBySalesMan.isnull(),
'AverageSalesValueBySalesMan'] = salesReportDataTest_dropped_attr_no_dup_P_and_S[
'ProductID'].map(salesReportDataTrain_share_feature_to_test_grp_by_ProductID_avg.AverageSalesValueByProduct)
#It means these salesman & ProductID combination is not present in train data
x=salesReportDataTest_dropped_attr_no_dup_P_and_S.drop([
'TotalSalesValueBySalesManProductID', 'QuantityBySalesManProductID', 'AverageSalesValueBySalesManProductID'], axis=1)
x[x.isnull().any(axis=1)]
# Derive variation columns
# Percent deviation of this row's per-item price from the product average.
salesReportDataTest_dropped_attr_no_dup_P_and_S['SalesValuePerItemVariationFromAvgProduct'] = (((salesReportDataTest_dropped_attr_no_dup_P_and_S['AverageSalesValueByProduct'] -
salesReportDataTest_dropped_attr_no_dup_P_and_S['SalesValuePerItem']).abs())/salesReportDataTest_dropped_attr_no_dup_P_and_S['AverageSalesValueByProduct'])*100
salesReportDataTest_dropped_attr_no_dup_P_and_S['perc_Quantity_by_Product'] = (
salesReportDataTest_dropped_attr_no_dup_P_and_S['Quantity']/salesReportDataTest_dropped_attr_no_dup_P_and_S['QuantityByProduct'])*100
# Percent deviation from the salesman's average per-item price.
salesReportDataTest_dropped_attr_no_dup_P_and_S['SalesValuePerItemVariationFromAvgSalesMan'] = (((salesReportDataTest_dropped_attr_no_dup_P_and_S['AverageSalesValueBySalesMan'] -
salesReportDataTest_dropped_attr_no_dup_P_and_S['SalesValuePerItem']).abs())/salesReportDataTest_dropped_attr_no_dup_P_and_S['AverageSalesValueBySalesMan'])*100
salesReportDataTest_dropped_attr_no_dup_P_and_S['perc_Quantity_by_SalesMan'] = (
salesReportDataTest_dropped_attr_no_dup_P_and_S['Quantity']/salesReportDataTest_dropped_attr_no_dup_P_and_S['QuantityBySalesMan'])*100
# Pair-level (salesman x product) deviation and quantity share.
salesReportDataTest_dropped_attr_no_dup_P_and_S['SalesValuePerItemVariationFromAvgSalesManProductID'] = (((salesReportDataTest_dropped_attr_no_dup_P_and_S['AverageSalesValueBySalesManProductID'] -
salesReportDataTest_dropped_attr_no_dup_P_and_S['SalesValuePerItem']).abs())/salesReportDataTest_dropped_attr_no_dup_P_and_S['AverageSalesValueBySalesManProductID'])*100
salesReportDataTest_dropped_attr_no_dup_P_and_S['perc_Quantity_by_SalesMan_ProductID'] = (
salesReportDataTest_dropped_attr_no_dup_P_and_S['Quantity']/salesReportDataTest_dropped_attr_no_dup_P_and_S['QuantityBySalesManProductID'])*100
salesReportDataTest_dropped_attr_no_dup_P_and_S.head()
salesReportDataTest_dropped_attr_no_dup_P_and_S.dtypes
salesReportDataTest_dropped_attr_no_dup_P_and_S.shape
# Drop the same raw-ID and intermediate-aggregate columns as in training so
# the test feature set matches the fitted pipeline.
col_to_drop = ['SalesPersonID', 'ProductID',
'TotalSalesValueByProduct', 'QuantityByProduct', 'AverageSalesValueByProduct',
'TotalSalesValueBySalesMan', 'QuantityBySalesMan', 'AverageSalesValueBySalesMan',
'TotalSalesValueBySalesManProductID', 'QuantityBySalesManProductID', 'AverageSalesValueBySalesManProductID',
'SalesValuePerItemVariationFromAvgSalesManProductID', 'perc_Quantity_by_SalesMan_ProductID']
salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter = salesReportDataTest_dropped_attr_no_dup_P_and_S.drop(
col_to_drop, axis=1)
salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter.head()
salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter.shape
salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter.dtypes
# Datatype conversions
salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter[cat_col] = covert_var(salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter[cat_col], "cat")
salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter[num_col] = covert_var(salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter[num_col], "num_float")
# Hold-out evaluation of the derived-feature model.
X_test = salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter.drop('Suspicious', axis=1)
y_test = salesReportDataTest_dropped_attr_no_dup_P_and_S_col_fillter['Suspicious']
y_test_pred = evaluate_model(CV_grid_model, X_test, y_test)
y_test_pred
plot_confusion_matrix(y_test,y_test_pred)